# Record the exact R version, platform, locale, and package versions used
# for this run (reproducibility snapshot; knitr output preserved below).
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.1 magrittr_1.5 tools_3.5.1 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.0 stringi_1.2.4 rmarkdown_1.11
## [9] knitr_1.20 stringr_1.3.1 digest_0.6.18 evaluate_0.12
# Unpack the R Markdown `params` list into plain top-level variables so the
# rest of the script can refer to them directly.
output.var    <- params$output.var
log.pred      <- params$log.pred
eda           <- params$eda
algo.forward  <- params$algo.forward
algo.backward <- params$algo.backward
algo.stepwise <- params$algo.stepwise
algo.LASSO    <- params$algo.LASSO
algo.LARS     <- params$algo.LARS

# Echo the parameter list into the report for traceability.
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 8
## $ output.var : chr "y3"
## $ log.pred : logi FALSE
## $ eda : logi TRUE
## $ algo.forward : logi FALSE
## $ algo.backward: logi FALSE
## $ algo.stepwise: logi FALSE
## $ algo.LASSO : logi FALSE
## $ algo.LARS : logi FALSE
# Setup Labels
# alt.scale.label.name = Alternate Scale variable name
# - if predicting on log, then alt.scale is normal scale
# - if predicting on normal scale, then alt.scale is log scale
#
# A single if/else guarantees both variables are always defined; the
# original pair of independent `if (log.pred == TRUE)` / `if (log.pred
# == FALSE)` blocks left them undefined for any other value.
if (log.pred == TRUE) {
  label.names <- paste0("log.", output.var)
  alt.scale.label.name <- output.var
} else {
  label.names <- output.var
  alt.scale.label.name <- paste0("log.", output.var)
}
# Load the feature matrix.
features <- read.csv("../../Data/features.csv")
#str(features)

# Pairwise correlations among the numeric feature columns, to 2 decimals.
numeric.cols <- sapply(features, is.numeric)
corr.matrix <- round(cor(features[numeric.cols]), 2)

# filter out only highly correlated variables
threshold <- 0.6
corr.matrix.tmp <- corr.matrix
diag(corr.matrix.tmp) <- 0  # ignore the trivial self-correlations
high.corr <- apply(abs(corr.matrix.tmp) >= threshold, 1, any)
high.corr.matrix <- corr.matrix.tmp[high.corr, high.corr]

# Interactive tables: full matrix, then the highly-correlated subset.
DT::datatable(corr.matrix)
DT::datatable(high.corr.matrix)
# Feature column names, excluding the JobName identifier.
feature.names <- colnames(features)
drops <- c("JobName")
feature.names <- feature.names[!feature.names %in% drops]
#str(feature.names)

# Load the labels and keep only the job id plus the chosen target column.
labels <- read.csv("../../Data/labels.csv")
#str(labels)
labels <- labels[, c("JobName", output.var)]
summary(labels)
## JobName y3
## Job_00001: 1 Min. : 95.91
## Job_00002: 1 1st Qu.:118.21
## Job_00003: 1 Median :123.99
## Job_00004: 1 Mean :125.36
## Job_00005: 1 3rd Qu.:131.06
## Job_00006: 1 Max. :193.73
## (Other) :9994 NA's :2497
# Join features to labels on the job id, then drop the id column.
data <- merge(features, labels, by = "JobName")
drops <- c("JobName")
data <- data[, !(colnames(data) %in% drops)]
#str(data)

# When modeling on the log scale, derive the log10 label column and drop
# the original-scale column so it cannot leak into the predictors.
if (log.pred == TRUE) {
  data[label.names] <- log(data[alt.scale.label.name], 10)
  drops <- c(alt.scale.label.name)
  data <- data[!(names(data) %in% drops)]
}
#str(data)

# Keep only rows with no missing values.
data <- data[complete.cases(data), ]
if (eda == TRUE){
  # Correlation of every feature with the label, rounded to 4 decimals.
  corr.to.label <- round(cor(dplyr::select(data, -dplyr::one_of(label.names)),
                             dplyr::select_at(data, label.names)), 4)
  DT::datatable(corr.to.label)
}
if (eda == TRUE){
  # Variance inflation factors, largest first; a high VIF flags
  # multicollinearity among the features.  All calls are now
  # namespace-qualified, consistent with the rest of the file (select_at,
  # arrange and %>% were previously unqualified and fail unless
  # dplyr/magrittr are attached).
  vifDF <- dplyr::arrange(usdm::vif(dplyr::select_at(data, feature.names)),
                          dplyr::desc(VIF))
  head(vifDF,10)
}
## Variables VIF
## 1 stat31 1.066774
## 2 stat113 1.059555
## 3 stat98 1.059344
## 4 stat105 1.059191
## 5 x22 1.058846
## 6 stat206 1.058561
## 7 stat178 1.058379
## 8 stat179 1.058364
## 9 stat142 1.058288
## 10 stat171 1.057939
# Diagonal panel function for pairs(): draws a histogram of x scaled to
# fit the panel (the standard helper from ?pairs).
panel.hist <- function(x, ...) {
  # Temporarily stretch the panel's user coordinates; restore on exit.
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  # Compute (but do not draw) the histogram, then scale counts to [0, 1].
  hst <- hist(x, plot = FALSE)
  n.breaks <- length(hst$breaks)
  bar.heights <- hst$counts / max(hst$counts)
  rect(hst$breaks[-n.breaks], 0, hst$breaks[-1], bar.heights,
       col = "cyan", ...)
}
if (eda == TRUE){
# Distribution of the (possibly log-scaled) target variable.
hist(data[ ,label.names])
#hist(data[complete.cases(data),alt.scale.label.name])
}
# https://stackoverflow.com/questions/24648729/plot-one-numeric-variable-against-n-numeric-variables-in-n-plots
# Scatter-plot the response `yvar` against each predictor in `xvars`,
# one chart per predictor.  When `xvars` is NULL, every column of `data`
# except `yvar` is used.
ind.pairs.plot <- function(data, xvars = NULL, yvar) {
  if (is.null(xvars)) {
    xvars <- names(data)[names(data) != yvar]
  }
  # seq_along() is safe when there are zero predictors; the original
  # 1:ncharts would evaluate to c(1, 0) and index with an invalid name.
  for (i in seq_along(xvars)) {
    plot(data[, xvars[i]], data[, yvar], xlab = xvars[i], ylab = yvar)
  }
}
if (eda == TRUE){
# Scatter the label against every feature, one plot per feature.
ind.pairs.plot(data, feature.names, label.names)
}
# NOTE(review): the transformation below is gated on eda == FALSE, i.e. it
# runs only on non-EDA (training/prediction) runs — confirm this gating is
# intentional and not a typo for eda == TRUE.
if(eda ==FALSE){
# x18 may need transformations
plot(data[,'x18'], data[,label.names], main = "Original Scatter Plot vs. x18", ylab = label.names, xlab = 'x18')
plot(sqrt(data[,'x18']), data[,label.names], main = "Original Scatter Plot vs. sqrt(x18)", ylab = label.names, xlab = 'sqrt(x18)')
# transforming x18
# Replace x18 with its square root (sqrt.x18) and drop the original column.
data$sqrt.x18 = sqrt(data$x18)
data = dplyr::select(data,-one_of('x18'))
# what about x7, x9?
# x11 looks like data is at discrete points after a while. Will this be a problem?
}
# Shuffle the rows, then split 80/20 into train and test sets.
# sample.split() comes from the caTools package and stratifies on the label.
data <- data[sample(nrow(data)), ]
split <- sample.split(data[, label.names], SplitRatio = 0.8)
data.train <- subset(data, split)
data.test <- subset(data, !split)
# Standard regression diagnostics for a fitted lm:
#   - the base plot(model) panels (residuals vs fitted, QQ, scale-location,
#     leverage)
#   - studentized and standardized residuals against fitted values
#   - a histogram of studentized residuals with a standard-normal overlay
#
# model: a fitted lm object.
# train: the data frame to predict on (normally the training set).
plot.diagnostics <- function(model, train) {
  plot(model)
  r.standard <- rstandard(model)
  r.student <- rstudent(model)
  # Compute the fitted values once (the original called predict() twice)
  # and drop the unused `residuals` local, which shadowed base::residuals.
  fitted.vals <- predict(model, train)
  plot(fitted.vals, r.student,
       ylab="Student Residuals", xlab="Predicted Values",
       main="Student Residual Plot")
  abline(0, 0)
  plot(fitted.vals, r.standard,
       ylab="Standard Residuals", xlab="Predicted Values",
       main="Standard Residual Plot")
  abline(0, 0)
  # +/- 2 reference bands: points outside are potential outliers.
  abline(2, 0)
  abline(-2, 0)
  # Histogram on the density scale so the normal curve is comparable.
  hist(r.student, freq=FALSE, main="Distribution of Studentized Residuals",
       xlab="Studentized Residuals", ylab="Density", ylim=c(0,0.5))
  # Create range of x-values for normal curve
  xfit <- seq(min(r.student)-1, max(r.student)+1, length=40)
  # Generate values from the normal distribution at the specified values
  yfit <- dnorm(xfit)
  # Add the normal curve
  lines(xfit, yfit, ylim=c(0,0.5))
}
# Build the two model formulas from the training-set column names:
#   formula            : label ~ every other column (full model)
#   grand.mean.formula : label ~ 1 (intercept-only null model)
n <- names(data.train)
formula <- as.formula(paste(paste(n[n %in% label.names], collapse = " + ")," ~", paste(n[!n %in% label.names], collapse = " + ")))
grand.mean.formula = as.formula(paste(paste(n[n %in% label.names], collapse = " + ")," ~ 1"))
print(formula)
## y3 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 +
## x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 +
## x22 + x23 + stat1 + stat2 + stat3 + stat4 + stat5 + stat6 +
## stat7 + stat8 + stat9 + stat10 + stat11 + stat12 + stat13 +
## stat14 + stat15 + stat16 + stat17 + stat18 + stat19 + stat20 +
## stat21 + stat22 + stat23 + stat24 + stat25 + stat26 + stat27 +
## stat28 + stat29 + stat30 + stat31 + stat32 + stat33 + stat34 +
## stat35 + stat36 + stat37 + stat38 + stat39 + stat40 + stat41 +
## stat42 + stat43 + stat44 + stat45 + stat46 + stat47 + stat48 +
## stat49 + stat50 + stat51 + stat52 + stat53 + stat54 + stat55 +
## stat56 + stat57 + stat58 + stat59 + stat60 + stat61 + stat62 +
## stat63 + stat64 + stat65 + stat66 + stat67 + stat68 + stat69 +
## stat70 + stat71 + stat72 + stat73 + stat74 + stat75 + stat76 +
## stat77 + stat78 + stat79 + stat80 + stat81 + stat82 + stat83 +
## stat84 + stat85 + stat86 + stat87 + stat88 + stat89 + stat90 +
## stat91 + stat92 + stat93 + stat94 + stat95 + stat96 + stat97 +
## stat98 + stat99 + stat100 + stat101 + stat102 + stat103 +
## stat104 + stat105 + stat106 + stat107 + stat108 + stat109 +
## stat110 + stat111 + stat112 + stat113 + stat114 + stat115 +
## stat116 + stat117 + stat118 + stat119 + stat120 + stat121 +
## stat122 + stat123 + stat124 + stat125 + stat126 + stat127 +
## stat128 + stat129 + stat130 + stat131 + stat132 + stat133 +
## stat134 + stat135 + stat136 + stat137 + stat138 + stat139 +
## stat140 + stat141 + stat142 + stat143 + stat144 + stat145 +
## stat146 + stat147 + stat148 + stat149 + stat150 + stat151 +
## stat152 + stat153 + stat154 + stat155 + stat156 + stat157 +
## stat158 + stat159 + stat160 + stat161 + stat162 + stat163 +
## stat164 + stat165 + stat166 + stat167 + stat168 + stat169 +
## stat170 + stat171 + stat172 + stat173 + stat174 + stat175 +
## stat176 + stat177 + stat178 + stat179 + stat180 + stat181 +
## stat182 + stat183 + stat184 + stat185 + stat186 + stat187 +
## stat188 + stat189 + stat190 + stat191 + stat192 + stat193 +
## stat194 + stat195 + stat196 + stat197 + stat198 + stat199 +
## stat200 + stat201 + stat202 + stat203 + stat204 + stat205 +
## stat206 + stat207 + stat208 + stat209 + stat210 + stat211 +
## stat212 + stat213 + stat214 + stat215 + stat216 + stat217
print(grand.mean.formula)
## y3 ~ 1
# Update feature.names because we may have transformed some features
feature.names = n[!n %in% label.names]
# Fit the full OLS model: every remaining column as a predictor.
model.full = lm(formula , data.train)
summary(model.full)
##
## Call:
## lm(formula = formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.422 -6.067 -1.711 4.532 55.809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.453e+01 2.722e+00 34.732 < 2e-16 ***
## x1 -1.330e-01 1.903e-01 -0.699 0.484468
## x2 4.795e-02 1.212e-01 0.395 0.692526
## x3 -5.789e-03 3.325e-02 -0.174 0.861791
## x4 -1.320e-02 2.636e-03 -5.008 5.67e-07 ***
## x5 1.534e-01 8.609e-02 1.782 0.074860 .
## x6 2.002e-01 1.730e-01 1.157 0.247261
## x7 3.316e+00 1.850e-01 17.928 < 2e-16 ***
## x8 1.459e-01 4.332e-02 3.367 0.000764 ***
## x9 8.738e-01 9.643e-02 9.062 < 2e-16 ***
## x10 3.455e-01 8.987e-02 3.844 0.000122 ***
## x11 7.188e+07 2.150e+07 3.342 0.000836 ***
## x12 -2.805e-02 5.463e-02 -0.514 0.607593
## x13 1.035e-02 2.171e-02 0.477 0.633730
## x14 -1.508e-01 9.388e-02 -1.606 0.108301
## x15 -6.076e-02 8.991e-02 -0.676 0.499167
## x16 3.394e-01 6.248e-02 5.433 5.76e-08 ***
## x17 3.983e-01 9.437e-02 4.221 2.47e-05 ***
## x18 1.661e+00 6.659e-02 24.949 < 2e-16 ***
## x19 2.794e-02 4.799e-02 0.582 0.560433
## x20 -2.343e-01 3.338e-01 -0.702 0.482810
## x21 3.772e-02 1.237e-02 3.049 0.002305 **
## x22 -1.623e-02 1.006e-01 -0.161 0.871817
## x23 -1.400e-02 9.589e-02 -0.146 0.883918
## stat1 -4.559e-02 7.223e-02 -0.631 0.527927
## stat2 -2.650e-02 7.198e-02 -0.368 0.712820
## stat3 1.420e-01 7.268e-02 1.954 0.050796 .
## stat4 -9.102e-02 7.284e-02 -1.250 0.211526
## stat5 -3.402e-02 7.228e-02 -0.471 0.637858
## stat6 -7.023e-02 7.232e-02 -0.971 0.331530
## stat7 -1.260e-02 7.213e-02 -0.175 0.861289
## stat8 4.356e-02 7.200e-02 0.605 0.545226
## stat9 -1.789e-02 7.179e-02 -0.249 0.803224
## stat10 -1.238e-01 7.182e-02 -1.724 0.084824 .
## stat11 -4.192e-02 7.269e-02 -0.577 0.564185
## stat12 4.300e-02 7.198e-02 0.597 0.550204
## stat13 -1.272e-01 7.135e-02 -1.783 0.074708 .
## stat14 -2.366e-01 7.195e-02 -3.289 0.001012 **
## stat15 -4.286e-02 7.170e-02 -0.598 0.550049
## stat16 -1.129e-02 7.180e-02 -0.157 0.875027
## stat17 -8.145e-02 7.167e-02 -1.136 0.255826
## stat18 -7.144e-02 7.167e-02 -0.997 0.318904
## stat19 4.091e-02 7.189e-02 0.569 0.569324
## stat20 -3.753e-02 7.195e-02 -0.522 0.601956
## stat21 -1.594e-02 7.268e-02 -0.219 0.826365
## stat22 -1.089e-01 7.235e-02 -1.506 0.132163
## stat23 1.806e-01 7.198e-02 2.509 0.012130 *
## stat24 -1.677e-01 7.204e-02 -2.327 0.019980 *
## stat25 -1.343e-01 7.183e-02 -1.870 0.061472 .
## stat26 -8.663e-02 7.207e-02 -1.202 0.229386
## stat27 1.452e-02 7.208e-02 0.201 0.840387
## stat28 2.085e-02 7.226e-02 0.289 0.772938
## stat29 1.210e-01 7.268e-02 1.665 0.095993 .
## stat30 8.621e-02 7.303e-02 1.181 0.237849
## stat31 -3.783e-02 7.270e-02 -0.520 0.602818
## stat32 7.471e-02 7.266e-02 1.028 0.303914
## stat33 -1.148e-01 7.201e-02 -1.594 0.111010
## stat34 4.494e-02 7.220e-02 0.622 0.533686
## stat35 -1.122e-01 7.200e-02 -1.558 0.119303
## stat36 4.093e-02 7.141e-02 0.573 0.566604
## stat37 -1.190e-01 7.247e-02 -1.643 0.100501
## stat38 1.107e-01 7.231e-02 1.531 0.125747
## stat39 -7.847e-03 7.145e-02 -0.110 0.912544
## stat40 6.254e-02 7.201e-02 0.868 0.385182
## stat41 -1.538e-01 7.142e-02 -2.154 0.031316 *
## stat42 -1.341e-01 7.167e-02 -1.871 0.061454 .
## stat43 -9.789e-02 7.247e-02 -1.351 0.176813
## stat44 4.461e-02 7.165e-02 0.623 0.533544
## stat45 -6.899e-02 7.165e-02 -0.963 0.335654
## stat46 1.034e-01 7.194e-02 1.438 0.150519
## stat47 3.190e-02 7.245e-02 0.440 0.659713
## stat48 5.232e-02 7.230e-02 0.724 0.469336
## stat49 1.245e-01 7.138e-02 1.744 0.081258 .
## stat50 3.205e-02 7.158e-02 0.448 0.654381
## stat51 1.323e-01 7.214e-02 1.834 0.066767 .
## stat52 -2.793e-02 7.256e-02 -0.385 0.700286
## stat53 -4.729e-02 7.272e-02 -0.650 0.515498
## stat54 -1.188e-01 7.261e-02 -1.636 0.101882
## stat55 9.055e-02 7.123e-02 1.271 0.203660
## stat56 -4.853e-02 7.194e-02 -0.675 0.500001
## stat57 -2.709e-02 7.155e-02 -0.379 0.705017
## stat58 -6.057e-02 7.155e-02 -0.847 0.397271
## stat59 -8.178e-03 7.216e-02 -0.113 0.909777
## stat60 1.514e-01 7.213e-02 2.099 0.035904 *
## stat61 4.074e-02 7.236e-02 0.563 0.573506
## stat62 -7.502e-02 7.158e-02 -1.048 0.294639
## stat63 6.351e-02 7.198e-02 0.882 0.377673
## stat64 -7.909e-02 7.176e-02 -1.102 0.270485
## stat65 -1.305e-01 7.252e-02 -1.800 0.071951 .
## stat66 8.899e-02 7.291e-02 1.220 0.222339
## stat67 2.169e-02 7.237e-02 0.300 0.764442
## stat68 -2.725e-02 7.206e-02 -0.378 0.705370
## stat69 -5.561e-02 7.175e-02 -0.775 0.438394
## stat70 1.145e-01 7.179e-02 1.595 0.110863
## stat71 -3.144e-02 7.157e-02 -0.439 0.660442
## stat72 3.687e-02 7.230e-02 0.510 0.610097
## stat73 1.103e-01 7.214e-02 1.529 0.126217
## stat74 -9.601e-02 7.226e-02 -1.329 0.184028
## stat75 1.414e-02 7.267e-02 0.195 0.845736
## stat76 6.563e-02 7.189e-02 0.913 0.361283
## stat77 1.651e-02 7.221e-02 0.229 0.819150
## stat78 -2.063e-02 7.206e-02 -0.286 0.774694
## stat79 -3.536e-02 7.246e-02 -0.488 0.625533
## stat80 2.830e-02 7.285e-02 0.389 0.697640
## stat81 8.520e-02 7.227e-02 1.179 0.238482
## stat82 3.443e-03 7.184e-02 0.048 0.961781
## stat83 -1.189e-02 7.195e-02 -0.165 0.868770
## stat84 1.453e-02 7.229e-02 0.201 0.840716
## stat85 8.610e-03 7.213e-02 0.119 0.904994
## stat86 3.014e-02 7.215e-02 0.418 0.676172
## stat87 -3.122e-02 7.250e-02 -0.431 0.666747
## stat88 -1.306e-02 7.147e-02 -0.183 0.854992
## stat89 -5.852e-02 7.173e-02 -0.816 0.414655
## stat90 -4.459e-02 7.218e-02 -0.618 0.536812
## stat91 -1.601e-01 7.166e-02 -2.234 0.025505 *
## stat92 -9.821e-02 7.194e-02 -1.365 0.172267
## stat93 -3.398e-02 7.293e-02 -0.466 0.641298
## stat94 -1.970e-02 7.197e-02 -0.274 0.784301
## stat95 -4.110e-02 7.195e-02 -0.571 0.567816
## stat96 -6.225e-02 7.241e-02 -0.860 0.390005
## stat97 -7.469e-03 7.136e-02 -0.105 0.916636
## stat98 9.945e-01 7.121e-02 13.966 < 2e-16 ***
## stat99 9.422e-02 7.220e-02 1.305 0.191954
## stat100 1.982e-01 7.239e-02 2.739 0.006189 **
## stat101 -8.971e-02 7.312e-02 -1.227 0.219955
## stat102 4.520e-02 7.221e-02 0.626 0.531434
## stat103 -3.401e-02 7.326e-02 -0.464 0.642512
## stat104 -6.126e-02 7.244e-02 -0.846 0.397783
## stat105 7.291e-02 7.152e-02 1.019 0.308043
## stat106 -5.050e-02 7.180e-02 -0.703 0.481828
## stat107 -3.187e-02 7.232e-02 -0.441 0.659437
## stat108 -6.683e-02 7.209e-02 -0.927 0.354004
## stat109 2.018e-02 7.218e-02 0.280 0.779809
## stat110 -9.673e-01 7.155e-02 -13.519 < 2e-16 ***
## stat111 4.178e-02 7.245e-02 0.577 0.564185
## stat112 -9.160e-02 7.253e-02 -1.263 0.206666
## stat113 9.453e-03 7.264e-02 0.130 0.896463
## stat114 5.242e-02 7.229e-02 0.725 0.468432
## stat115 3.961e-02 7.178e-02 0.552 0.581074
## stat116 7.419e-02 7.244e-02 1.024 0.305797
## stat117 4.926e-02 7.228e-02 0.681 0.495583
## stat118 -4.218e-02 7.186e-02 -0.587 0.557227
## stat119 1.669e-02 7.215e-02 0.231 0.817063
## stat120 1.605e-02 7.169e-02 0.224 0.822863
## stat121 -1.236e-02 7.214e-02 -0.171 0.863993
## stat122 -2.769e-02 7.198e-02 -0.385 0.700541
## stat123 7.683e-02 7.339e-02 1.047 0.295178
## stat124 -2.495e-02 7.216e-02 -0.346 0.729552
## stat125 8.646e-02 7.234e-02 1.195 0.232074
## stat126 9.590e-02 7.179e-02 1.336 0.181645
## stat127 -2.855e-03 7.175e-02 -0.040 0.968265
## stat128 -6.034e-02 7.229e-02 -0.835 0.403927
## stat129 -5.230e-03 7.208e-02 -0.073 0.942153
## stat130 3.353e-02 7.221e-02 0.464 0.642464
## stat131 -5.837e-02 7.220e-02 -0.808 0.418842
## stat132 -2.335e-02 7.169e-02 -0.326 0.744666
## stat133 3.967e-02 7.216e-02 0.550 0.582499
## stat134 -3.905e-02 7.170e-02 -0.545 0.586026
## stat135 -3.763e-02 7.218e-02 -0.521 0.602112
## stat136 4.740e-02 7.222e-02 0.656 0.511655
## stat137 -2.850e-02 7.198e-02 -0.396 0.692120
## stat138 5.991e-03 7.182e-02 0.083 0.933528
## stat139 -3.735e-02 7.222e-02 -0.517 0.605050
## stat140 -3.976e-02 7.180e-02 -0.554 0.579813
## stat141 5.683e-02 7.162e-02 0.793 0.427524
## stat142 -4.771e-02 7.292e-02 -0.654 0.512962
## stat143 3.615e-03 7.176e-02 0.050 0.959826
## stat144 1.395e-01 7.146e-02 1.952 0.051005 .
## stat145 -1.740e-02 7.281e-02 -0.239 0.811102
## stat146 -9.462e-02 7.275e-02 -1.301 0.193457
## stat147 2.570e-02 7.284e-02 0.353 0.724245
## stat148 -6.022e-02 7.113e-02 -0.847 0.397193
## stat149 -1.582e-01 7.302e-02 -2.167 0.030266 *
## stat150 -2.945e-02 7.216e-02 -0.408 0.683185
## stat151 -1.489e-01 7.323e-02 -2.033 0.042139 *
## stat152 -1.146e-01 7.207e-02 -1.591 0.111748
## stat153 4.537e-02 7.260e-02 0.625 0.531993
## stat154 -1.358e-02 7.272e-02 -0.187 0.851826
## stat155 -5.099e-02 7.195e-02 -0.709 0.478562
## stat156 1.594e-01 7.248e-02 2.199 0.027902 *
## stat157 -1.463e-02 7.175e-02 -0.204 0.838401
## stat158 5.818e-03 7.337e-02 0.079 0.936801
## stat159 -2.885e-03 7.195e-02 -0.040 0.968013
## stat160 3.370e-02 7.218e-02 0.467 0.640598
## stat161 7.736e-02 7.299e-02 1.060 0.289272
## stat162 4.477e-03 7.178e-02 0.062 0.950268
## stat163 -1.947e-04 7.268e-02 -0.003 0.997863
## stat164 6.639e-02 7.257e-02 0.915 0.360328
## stat165 -5.776e-03 7.171e-02 -0.081 0.935811
## stat166 -8.617e-02 7.131e-02 -1.208 0.226933
## stat167 -1.795e-02 7.185e-02 -0.250 0.802769
## stat168 -3.778e-02 7.187e-02 -0.526 0.599105
## stat169 4.980e-03 7.181e-02 0.069 0.944716
## stat170 -5.207e-02 7.248e-02 -0.718 0.472482
## stat171 -1.075e-02 7.282e-02 -0.148 0.882621
## stat172 5.985e-02 7.210e-02 0.830 0.406498
## stat173 -1.956e-02 7.224e-02 -0.271 0.786566
## stat174 2.097e-02 7.237e-02 0.290 0.771973
## stat175 -5.772e-02 7.248e-02 -0.796 0.425870
## stat176 2.057e-02 7.172e-02 0.287 0.774213
## stat177 -4.565e-03 7.280e-02 -0.063 0.949999
## stat178 -2.444e-02 7.311e-02 -0.334 0.738148
## stat179 4.643e-02 7.191e-02 0.646 0.518480
## stat180 1.317e-02 7.175e-02 0.184 0.854410
## stat181 6.927e-02 7.268e-02 0.953 0.340576
## stat182 -3.640e-02 7.278e-02 -0.500 0.616987
## stat183 3.659e-02 7.147e-02 0.512 0.608708
## stat184 -8.906e-03 7.281e-02 -0.122 0.902655
## stat185 -5.439e-02 7.143e-02 -0.761 0.446426
## stat186 -5.383e-02 7.241e-02 -0.743 0.457281
## stat187 -1.589e-01 7.143e-02 -2.225 0.026125 *
## stat188 1.925e-02 7.178e-02 0.268 0.788555
## stat189 -2.989e-02 7.196e-02 -0.415 0.677890
## stat190 2.635e-02 7.200e-02 0.366 0.714373
## stat191 -8.420e-03 7.227e-02 -0.116 0.907261
## stat192 5.684e-02 7.291e-02 0.780 0.435687
## stat193 -2.319e-02 7.275e-02 -0.319 0.749874
## stat194 -5.032e-02 7.172e-02 -0.702 0.482993
## stat195 9.792e-02 7.211e-02 1.358 0.174560
## stat196 1.555e-02 7.268e-02 0.214 0.830588
## stat197 2.652e-02 7.123e-02 0.372 0.709692
## stat198 -1.067e-01 7.229e-02 -1.476 0.140091
## stat199 1.164e-02 7.179e-02 0.162 0.871218
## stat200 -1.375e-01 7.144e-02 -1.925 0.054266 .
## stat201 -1.319e-02 7.225e-02 -0.183 0.855161
## stat202 -8.990e-02 7.289e-02 -1.233 0.217460
## stat203 2.161e-02 7.202e-02 0.300 0.764129
## stat204 -1.287e-01 7.196e-02 -1.789 0.073681 .
## stat205 -9.291e-02 7.161e-02 -1.297 0.194545
## stat206 -4.570e-02 7.285e-02 -0.627 0.530471
## stat207 1.125e-01 7.261e-02 1.549 0.121368
## stat208 -2.075e-02 7.268e-02 -0.285 0.775278
## stat209 2.144e-02 7.210e-02 0.297 0.766240
## stat210 -2.033e-03 7.197e-02 -0.028 0.977462
## stat211 -6.263e-02 7.171e-02 -0.873 0.382559
## stat212 -1.068e-02 7.197e-02 -0.148 0.881994
## stat213 -2.998e-02 7.221e-02 -0.415 0.678022
## stat214 -1.414e-01 7.202e-02 -1.964 0.049616 *
## stat215 -1.225e-01 7.220e-02 -1.697 0.089773 .
## stat216 -5.887e-02 7.198e-02 -0.818 0.413500
## stat217 4.466e-02 7.221e-02 0.619 0.536239
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.477 on 5761 degrees of freedom
## Multiple R-squared: 0.2376, Adjusted R-squared: 0.2058
## F-statistic: 7.479 on 240 and 5761 DF, p-value: < 2.2e-16
# Diagnostics for the full model.
plot.diagnostics(model.full, data.train)
# Intercept-only baseline (grand mean) model, used as the lower scope for
# forward/stepwise selection below.
model.null = lm(grand.mean.formula, data.train)
summary(model.null)
##
## Call:
## lm(formula = grand.mean.formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.436 -7.039 -1.384 5.620 61.624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 125.3483 0.1373 913.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.63 on 6001 degrees of freedom
# Diagnostics for the null model (plot 5 is skipped: leverages constant).
plot.diagnostics(model.null, data.train)
## hat values (leverages) are all = 0.0001666111
## and there are no factor predictors; no plot no. 5

# Reference: http://www.stat.columbia.edu/~martin/W2024/R10.pdf
# (the URL above was previously a bare, uncommented line — a parse error
# in R — so it is now a comment.)
if (algo.forward == TRUE){
  # Forward selection: start at the intercept-only model and add
  # predictors within the full model's scope, one step at a time (AIC).
  t1 <- Sys.time()
  model.forward <- step(model.null, scope=list(lower=model.null, upper=model.full), direction="forward")
  print(summary(model.forward))
  saveRDS(model.forward, file = "model_forward.rds")
  t2 <- Sys.time()
  # format() keeps the difftime units (secs/mins) in the message; the bare
  # t2 - t1 printed a unitless number whose scale varied with run length.
  print(paste0("Time taken for Forward Selection: ", format(t2 - t1)))
  plot.diagnostics(model.forward, data.train)
}
if (algo.backward == TRUE){
  # Backward elimination from the full model.
  # Takes too much time with ~240 predictors.
  t1 <- Sys.time()
  model.backward <- step(model.full, data = data.train, direction="backward")
  print(summary(model.backward))
  # BUG FIX: previously saved model.forward here, i.e. the wrong model.
  saveRDS(model.backward, file = "model_backward.rds")
  t2 <- Sys.time()
  # format() keeps the difftime units (secs/mins) in the message.
  print(paste0("Time taken for Backward Elimination: ", format(t2 - t1)))
  plot.diagnostics(model.backward, data.train)
}
if (algo.stepwise == TRUE){
  # Stepwise (bidirectional) selection starting from the null model.
  t1 <- Sys.time()
  model.stepwise <- step(model.null, scope=list(upper=model.full), data = data.train, direction="both")
  print(summary(model.stepwise))
  # BUG FIX: previously saved model.forward here, i.e. the wrong model.
  saveRDS(model.stepwise, file = "model_stepwise.rds")
  t2 <- Sys.time()
  # format() keeps the difftime units (secs/mins) in the message.
  print(paste0("Time taken for Stepwise Selection: ", format(t2 - t1)))
  plot.diagnostics(model.stepwise, data.train)
}
if (algo.LASSO == TRUE){
# LASSO via 5-fold cross-validated glmnet (cv.glmnet comes from the
# glmnet package; gaussian family by default, alpha = 1 -> pure lasso).
t1 = Sys.time()
model.LASSO = cv.glmnet(as.matrix(data.train[,feature.names]), data.train[,label.names], nfolds = 5, standardize = TRUE)
summary(model.LASSO)
t2 = Sys.time()
print (paste("Time taken for LASSO: ",t2-t1, sep = ""))
# Cross-validation curve: MSE vs log(lambda).
plot(model.LASSO)
# lambda.1se: the largest lambda within one standard error of the
# CV-minimum error (a more parsimonious model than lambda.min).
best_lambda = model.LASSO$lambda.1se
# Coefficients at that lambda.  lambda.1se is taken from the same lambda
# vector stored in the fit, so the exact == comparison is safe here.
lasso_coef = model.LASSO$glmnet.fit$beta[ , model.LASSO$glmnet.fit$lambda == best_lambda]
print (lasso_coef)
# The nonzero coefficients are the variables LASSO selected.
lasso_coef [ abs(lasso_coef) > 0 ]
}
# summary(model.forward)
# summary(model.stepwise)